library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 2.2.1 ✔ purrr 0.2.4
## ✔ tibble 1.4.2 ✔ dplyr 0.7.4
## ✔ tidyr 0.8.0 ✔ stringr 1.3.1
## ✔ readr 1.1.1 ✔ forcats 0.3.0
## ── Conflicts ─────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(ggrepel)
library(ggcorrplot)
library(gmp)
##
## Attaching package: 'gmp'
## The following objects are masked from 'package:base':
##
## %*%, apply, crossprod, matrix, tcrossprod
library(GGally)
##
## Attaching package: 'GGally'
## The following object is masked from 'package:dplyr':
##
## nasa
bad_strains <- c('YBR084C-A', 'YBR098W', 'YCR036W', 'YCR044C', 'YCR077C', 'YHR090C',
'YJL117W', 'YJL190C', 'YKR024C', 'YKR062W', 'YML008C', 'YML051W', 'YMR231W',
'YNL330C', 'YPR072W', 'YDL192W', 'YDL082W', 'YDR443C', 'YDL135C', 'YDL135C')
##### bad strains are library strains to be removed (according to Amelie's file, and Hannes's instruction, those should be removed)
### hannes_averaged4 E-MAP score data for Gsp1 mutants
#### read it in and make it tidy
e.map <- read_tsv("basic_E-MAP_data/avg_merged_June2016_screen_for_Gia.txt", col_names = T) ## use export for Gia because it has ORF names for library
## Parsed with column specification:
## cols(
## .default = col_double(),
## Gene = col_character()
## )
## See spec(...) for full column specifications.
e.map <- e.map %>% gather(library, score, -Gene) %>%
filter(! library %in% bad_strains) %>%
arrange(desc(Gene), desc(library)) %>%
mutate("Gene" = gsub(Gene, pattern = "GSP1:", replacement = "", perl = T))
#### these are all the gsp1 mutants screened (including WT-NAT and 3xFLAG constructs)
### 59 query constructs all together
(Gsp1_queries <- unique(e.map[["Gene"]]))
## [1] "Y157A" "Y148I" "T34Y" "T34S"
## [5] "T34Q" "T34N" "T34L" "T34G"
## [9] "T34E" "T34D" "T34A" "T139R"
## [13] "T139A" "T137G" "R78K" "R112S"
## [17] "R112A" "R108Y" "R108S" "R108Q"
## [21] "R108L" "R108I" "R108G" "R108D"
## [25] "R108A" "Q147L" "Q147E" "NTER3XFLAG_WT"
## [29] "N84Y" "N105V" "N105L" "N102M"
## [33] "N102K" "N102I" "K169I" "K154M"
## [37] "K143Y" "K143W" "K143H" "K132H"
## [41] "K129T" "K129I" "K129F" "K129E"
## [45] "K101R" "H141V" "H141R" "H141I"
## [49] "H141E" "GSP1-NAT" "G80A" "F58L"
## [53] "F58A" "E115I" "E115A" "D79S"
## [57] "D79A" "CTER3XFLAG_WT" "A180T"
### ORF to gene name annotation from SGD
orf_gene_name_index <- read_delim("orf_gene_GO_sgd_annotation.txt", col_names = F, delim = "\t")
## Parsed with column specification:
## cols(
## X1 = col_character(),
## X2 = col_character(),
## X3 = col_character()
## )
orf_index <- unique(tibble("orf" = orf_gene_name_index$X1, "gene_name" = orf_gene_name_index$X2))
rm(orf_gene_name_index)
#############################
### Gsp1 mutants merged with ubermap (merged with ubermap if there is an at least 100 library genes overlap between
# chromatin library and the library against which that query was screened)
# (gene names) - this is only necessary to distinguish point mutants (they are all the same in the ORF file)
gene_names.ubermap <- read_delim("basic_E-MAP_data/20180108_gene_names_merge_w_Ubermap_100.txt",
delim = "\t", col_names = T, n_max = length(Gsp1_queries))
## Parsed with column specification:
## cols(
## .default = col_double(),
## Gene = col_character()
## )
## See spec(...) for full column specifications.
gene_names.ubermap <- gene_names.ubermap %>% gather(library.ORF, score, -Gene) %>%
mutate("Gene" = gsub(Gene, pattern = "GSP1 - ", replacement = "", perl = T))
mutants <- unique(gene_names.ubermap$Gene)
rm(gene_names.ubermap)
### Gsp1 mutants merged with ubermap (orf names)
# replace Gsp1 ORFs with mutation names (because they are all the same orf!)
ubermap <- read_delim("basic_E-MAP_data/20180108_orf_names_merge_w_Ubermap_100.txt", col_names = T, delim = "\t")
## Parsed with column specification:
## cols(
## .default = col_double(),
## Gene = col_character()
## )
## See spec(...) for full column specifications.
ubermap[ 1:length(mutants), 1 ] <- mutants
#### can't gather before sorting out those few Genes that occur multiple times (add a unique identifier to them)
### first find those Genes that are not unique (probably data from separate experiments/labs for the same thing?
## Stll better not to mix them up)
(not_uniq <- ubermap %>%
group_by(Gene) %>%
summarize("count" = n()) %>%
filter(count > 1) %>%
ungroup()
)
## # A tibble: 10 x 2
## Gene count
## <chr> <int>
## 1 YDL155W - YDL155W 2
## 2 YDR113C - YDR113C 4
## 3 YER008C - YER008C 2
## 4 YGR038W - YGR038W 3
## 5 YGR108W - YGR108W 2
## 6 YGR109C - YGR109C 2
## 7 YJL085W - YJL085W 2
## 8 YLR350W - YLR350W 3
## 9 YPR119W - YPR119W 2
## 10 YPR120C - YPR120C 2
#### check correlations between non-unique samples
not_uniq_uber <- ubermap %>%
filter(Gene %in% not_uniq$Gene) %>%
arrange(Gene) %>%
mutate("Gene" = str_c(Gene, 1:24, sep = "_"))
not_uniq_matrix <- as.matrix(not_uniq_uber[, -1])
rownames(not_uniq_matrix) <- not_uniq_uber$Gene
cormat <- round(cor(t(not_uniq_matrix[, -1]), use = "pairwise.complete.obs"), 2)
####### correlations between duplicate data doesn't look that good
cormat %>% ggcorrplot(hc.order = TRUE, outline.col = "white",insig = "blank", tl.cex = 7) +
ggtitle("Pearson correlations between repeated screens")

### these correlations look really bad
## check if this is due to the small overlap in libraries
not_uniq_uber_gather <- not_uniq_uber %>%
gather(library.ORF, score, -Gene) %>%
mutate("Gene.ORF" = gsub(Gene, pattern = " - .*", replacement = "", perl = T))
non_uniq_gene_ORFS <- not_uniq_uber_gather %>% pull(Gene.ORF) %>% unique()
plots <- list()
for (i in seq_along(non_uniq_gene_ORFS)) {
plots[[i]] <- not_uniq_uber_gather %>%
filter(Gene.ORF == non_uniq_gene_ORFS[i]) %>%
select(-Gene.ORF) %>%
spread(Gene, score) %>%
select(-library.ORF) %>%
ggpairs()
}
print(plots)
## [[1]]
## Warning: Removed 376 rows containing non-finite values (stat_density).
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 750 rows containing missing values
## Warning: Removed 750 rows containing missing values (geom_point).
## Warning: Removed 750 rows containing non-finite values (stat_density).

##
## [[2]]
## Warning: Removed 767 rows containing non-finite values (stat_density).
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 784 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 767 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 782 rows containing missing values
## Warning: Removed 784 rows containing missing values (geom_point).
## Warning: Removed 766 rows containing non-finite values (stat_density).
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 766 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 779 rows containing missing values
## Warning: Removed 767 rows containing missing values (geom_point).
## Warning: Removed 766 rows containing missing values (geom_point).
## Warning: Removed 749 rows containing non-finite values (stat_density).
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 765 rows containing missing values
## Warning: Removed 782 rows containing missing values (geom_point).
## Warning: Removed 779 rows containing missing values (geom_point).
## Warning: Removed 765 rows containing missing values (geom_point).
## Warning: Removed 765 rows containing non-finite values (stat_density).

##
## [[3]]
## Warning: Removed 900 rows containing non-finite values (stat_density).
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 972 rows containing missing values
## Warning: Removed 972 rows containing missing values (geom_point).
## Warning: Removed 260 rows containing non-finite values (stat_density).

##
## [[4]]
## Warning: Removed 901 rows containing non-finite values (stat_density).
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 901 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 901 rows containing missing values
## Warning: Removed 901 rows containing missing values (geom_point).
## Warning: Removed 454 rows containing non-finite values (stat_density).
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 899 rows containing missing values
## Warning: Removed 901 rows containing missing values (geom_point).
## Warning: Removed 899 rows containing missing values (geom_point).
## Warning: Removed 899 rows containing non-finite values (stat_density).

##
## [[5]]
## Warning: Removed 137 rows containing non-finite values (stat_density).
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 789 rows containing missing values
## Warning: Removed 789 rows containing missing values (geom_point).
## Warning: Removed 750 rows containing non-finite values (stat_density).

##
## [[6]]
## Warning: Removed 398 rows containing non-finite values (stat_density).
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 775 rows containing missing values
## Warning: Removed 775 rows containing missing values (geom_point).
## Warning: Removed 761 rows containing non-finite values (stat_density).

##
## [[7]]
## Warning: Removed 910 rows containing non-finite values (stat_density).
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 1076 rows containing missing values
## Warning: Removed 1076 rows containing missing values (geom_point).
## Warning: Removed 507 rows containing non-finite values (stat_density).

##
## [[8]]
## Warning: Removed 891 rows containing non-finite values (stat_density).
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 892 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 891 rows containing missing values
## Warning: Removed 892 rows containing missing values (geom_point).
## Warning: Removed 65 rows containing non-finite values (stat_density).
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 892 rows containing missing values
## Warning: Removed 891 rows containing missing values (geom_point).
## Warning: Removed 892 rows containing missing values (geom_point).
## Warning: Removed 891 rows containing non-finite values (stat_density).

##
## [[9]]
## Warning: Removed 749 rows containing non-finite values (stat_density).
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 749 rows containing missing values
## Warning: Removed 749 rows containing missing values (geom_point).
## Warning: Removed 67 rows containing non-finite values (stat_density).

##
## [[10]]
## Warning: Removed 547 rows containing non-finite values (stat_density).
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 904 rows containing missing values
## Warning: Removed 904 rows containing missing values (geom_point).
## Warning: Removed 746 rows containing non-finite values (stat_density).

##### add a number in front of all the non-unique Gene
for (temp.gene in not_uniq$Gene) {
row.names <- which(ubermap$Gene == temp.gene)
for (i in seq_along(row.names)) {
ubermap$Gene[row.names[i]] <- str_c(i, ubermap$Gene[row.names[i]], sep = "_")
}
}
### now those few non-unique Genes look like this: 1_YDL155W - YDL155W
#### check
filter(ubermap, Gene == "1_YDL155W - YDL155W" | Gene == "2_YDL155W - YDL155W")
## # A tibble: 2 x 1,357
## Gene YAL002W YAL007C YAL010C YAL011W YAL013W YAL019W YAL020C YAL021C
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1_YDL15… -1.03 NA 0.653 0.349 -0.633 0.514 0.133 0.592
## 2 2_YDL15… 1.30 NA NA NA NA -1.39 NA NA
## # ... with 1,348 more variables: YAL023C <dbl>, YAL024C <dbl>,
## # YAL026C <dbl>, YAL027W <dbl>, YAL040C <dbl>, YAL042W <dbl>,
## # YAL048C <dbl>, YAL051W <dbl>, YAL053W <dbl>, YAL055W <dbl>,
## # `YAR002C-A` <dbl>, YAR002W <dbl>, YAR003W <dbl>, YAR014C <dbl>,
## # YAR018C <dbl>, YBL007C <dbl>, YBL008W <dbl>, YBL011W <dbl>,
## # YBL017C <dbl>, YBL021C <dbl>, YBL024W <dbl>, YBL027W <dbl>,
## # YBL032W <dbl>, YBL036C <dbl>, YBL046W <dbl>, YBL047C <dbl>,
## # YBL051C <dbl>, YBL052C <dbl>, YBL058W <dbl>, YBL061C <dbl>,
## # YBL066C <dbl>, YBL067C <dbl>, YBL072C <dbl>, YBL075C <dbl>,
## # YBL078C <dbl>, YBL079W <dbl>, YBL081W <dbl>, YBL082C <dbl>,
## # YBL085W <dbl>, YBL086C <dbl>, YBL087C <dbl>, YBL088C <dbl>,
## # YBL101C <dbl>, YBL103C <dbl>, YBR009C <dbl>, YBR010W <dbl>,
## # YBR015C <dbl>, YBR022W <dbl>, YBR023C <dbl>, YBR025C <dbl>,
## # YBR031W <dbl>, YBR034C <dbl>, YBR036C <dbl>, YBR041W <dbl>,
## # YBR057C <dbl>, YBR058C <dbl>, YBR061C <dbl>, YBR065C <dbl>,
## # YBR069C <dbl>, YBR073W <dbl>, YBR076W <dbl>, YBR078W <dbl>,
## # YBR082C <dbl>, YBR083W <dbl>, YBR088C_DAmP <dbl>, YBR093C <dbl>,
## # YBR094W <dbl>, YBR095C <dbl>, YBR098W <dbl>, YBR103W <dbl>,
## # YBR106W <dbl>, YBR107C <dbl>, YBR108W <dbl>, YBR114W <dbl>,
## # YBR119W <dbl>, YBR125C <dbl>, YBR126C <dbl>, YBR131W <dbl>,
## # YBR132C <dbl>, YBR135W_DAmP <dbl>, YBR141C <dbl>, YBR150C <dbl>,
## # YBR156C <dbl>, YBR159W <dbl>, YBR160W_DAmP <dbl>, YBR162C <dbl>,
## # `YBR162W-A` <dbl>, YBR164C <dbl>, YBR168W <dbl>, YBR171W <dbl>,
## # YBR175W <dbl>, YBR181C <dbl>, YBR186W <dbl>, YBR193C_DAmP <dbl>,
## # YBR194W <dbl>, YBR195C <dbl>, YBR200W <dbl>, YBR201W <dbl>,
## # YBR212W <dbl>, YBR215W <dbl>, …
(not_uniq <- ubermap %>% group_by(Gene) %>%
summarize("count" = n()) %>% filter(count > 1))
## # A tibble: 0 x 2
## # ... with 2 variables: Gene <chr>, count <int>
# now I can gather the data
ubermap <- ubermap %>% gather(library.ORF, score, -Gene)
## mutants only
ubermap.mutants.only <- ubermap %>% filter( Gene %in% mutants) %>%
#mutate("library.ORF" = gsub(library.ORF, pattern = "\\.", replacement = "-", perl = T)) %>%
### this is to replace the YOR298C.A with YOR298C-A (R puts . instead of - when loading library genes as column names)
### deprecated step now that we have tidyverse!
add_column("ORF" = "YLR293C") %>%
rename("Gene_uniq" = Gene) %>%
mutate("Gene.gene_name" = "GSP1") %>%
inner_join(., orf_index, by = c("library.ORF" = "orf")) %>%
rename("library.gene_name" = gene_name)
# save the mutant only data
write_tsv(ubermap.mutants.only, path = "basic_E-MAP_data/preprocessed_ubermap_mut_only.txt")
ubermap.mutants.only %>%
ggplot(., mapping = aes(x = score)) +
geom_density() + ggtitle("Distribution of E-MAP scores in Gsp1 mutants")
## Warning: Removed 865 rows containing non-finite values (stat_density).

#### look at the distribution of scores for Gsp1 mutants
ubermap.mutants.only %>%
ggplot(ubermap.mutants.only,
mapping = aes(x = score,
y = reorder(Gene_uniq, score, FUN = function(x) quantile(x, prob = 0.05, na.rm = T)))) +
geom_boxplot() +
ylab("Mutants ordered by the 5th E-MAP score percentile") +
xlab("E-MAP score")
## Warning: Removed 865 rows containing non-finite values (stat_boxplot).
## Warning: position_dodge requires non-overlapping x intervals

ubermap.per.mutant.95conf <- ubermap.mutants.only %>%
group_by(Gene_uniq) %>%
summarize(lower_quant = quantile(score, probs = .025, na.rm = T),
upper_quant = quantile(score, probs = .975, na.rm = T),
upper_whisker = boxplot(score, plot = F)$stats[5, ],
lower_whisker = boxplot(score, plot = F)$stats[1, ]) %>%
mutate(category = factor(Gene_uniq, levels = Gene_uniq)) %>%
arrange(lower_quant)
ubermap.per.mutant.95conf_strong <- ubermap.per.mutant.95conf %>%
filter(lower_quant < -3)
ubermap.per.mutant.95conf %>%
ggplot(., mapping = aes(x = lower_quant, y = upper_quant)) + geom_point() +
geom_label_repel(data = ubermap.per.mutant.95conf_strong, aes(label = Gene_uniq)) +
xlab("E-MAP score - lower 2.5 %") + ylab("E-MAP score - higher 97.5 %")

ubermap.per.mutant.95conf %>%
ggplot(., mapping = aes(x = lower_whisker, y = upper_whisker)) + geom_point() +
geom_label_repel(data = ubermap.per.mutant.95conf_strong, aes(label = Gene_uniq)) +
xlab("E-MAP score - lower whisker") + ylab("E-MAP score - upper whisker")

#ubermap.mutants.only <- ubermap.mutants.only %>%
# mutate(category = factor(Gene, levels = ubermap.per.mutant.95conf$Gene)) %>% #### turn into factors so I can order by the lower quartile
# arrange(category)
## tidy up all but mutants
ubermap.ubergenes.only <- ubermap %>% filter(! (Gene %in% mutants) ) %>%
#### this is to make sure things like YLR337C - DELTA YLR338W - YLR337C - DELTA YLR338W are not collapse to just YLR337C in the next step
mutate("Gene" = gsub(Gene, pattern = " - DELTA", replacement = " -- DELTA", perl = T)) %>%
# this replacement is to remove the redundant ORF before the unique ORF (e.g. YFL008W_TSQ321 - YFL008W_TSQ321 -> keep only the first one) - > first one has the unique i_ for the few redundant ones!
mutate("Gene" = gsub(Gene, pattern = " - .+$", replacement = "", perl = T))
### this is to replace the YOR298C.A with YOR298C-A (R puts . instead of - when loading library genes as column names) - deprecated with tidyverse
#mutate("Gene" = gsub(Gene, pattern = "\\.", replacement = "-", perl = T))
(deltaORFS <- ubermap.ubergenes.only %>%
filter(., grepl("DELTA", Gene)) %>%
pull(Gene) %>% unique())
## [1] "YDR134C -- DELTA YDR133C" "YKL119C -- DELTA YKL118W"
## [3] "YLR337C -- DELTA YLR338W" "YLR262C -- DELTA YLR261C"
## [5] "YPR075C -- DELTA YPR075C" "YOL051W -- DELTA YOL050C"
## [7] "YJL176C -- DELTA YJL175W" "YNL227C -- DELTA YNL228W"
## [9] "YJL168C -- DELTA YJL169W" "YPL113C -- DELTA YPL114W"
## [11] "YNL236W -- DELTA YNL235C" "YNL297C -- DELTA YNL296W"
### most of the DELTA ORFs are dubious reading frames
### I will just use the first ORF for all of them
ubermap.ubergenes.only %>%
filter(Gene == "YDR134C -- DELTA YDR133C") %>%
mutate("ORF" = gsub(Gene, pattern = "^[0-9]+_", replacement = "", perl = T))
## # A tibble: 1,356 x 4
## Gene library.ORF score ORF
## <chr> <chr> <dbl> <chr>
## 1 YDR134C -- DELTA YDR133C YAL002W 1.13 YDR134C -- DELTA YDR133C
## 2 YDR134C -- DELTA YDR133C YAL007C NA YDR134C -- DELTA YDR133C
## 3 YDR134C -- DELTA YDR133C YAL010C NA YDR134C -- DELTA YDR133C
## 4 YDR134C -- DELTA YDR133C YAL011W 3.33 YDR134C -- DELTA YDR133C
## 5 YDR134C -- DELTA YDR133C YAL013W 1.48 YDR134C -- DELTA YDR133C
## 6 YDR134C -- DELTA YDR133C YAL019W 0.842 YDR134C -- DELTA YDR133C
## 7 YDR134C -- DELTA YDR133C YAL020C NA YDR134C -- DELTA YDR133C
## 8 YDR134C -- DELTA YDR133C YAL021C NA YDR134C -- DELTA YDR133C
## 9 YDR134C -- DELTA YDR133C YAL023C NA YDR134C -- DELTA YDR133C
## 10 YDR134C -- DELTA YDR133C YAL024C NA YDR134C -- DELTA YDR133C
## # ... with 1,346 more rows
#### in this step I make a column that has only the basic ORF - this is necessary so I can match to SGD gene name
ubermap.ubergenes.only <- ubermap.ubergenes.only %>%
rename("Gene_uniq" = "Gene") %>%
mutate("ORF" = gsub(Gene_uniq, pattern = "^[0-9]+_", replacement = "", perl = T)) %>%
mutate("ORF" = gsub(ORF, pattern = "_.+$", replacement = "", perl = T)) %>%
mutate("ORF" = gsub(ORF, pattern = " -- DELTA .+", replacement = "", perl = T))
ubermap.ubergenes.only %>%
filter(ORF == "YKL119C")
## # A tibble: 2,712 x 4
## Gene_uniq library.ORF score ORF
## <chr> <chr> <dbl> <chr>
## 1 YKL119C -- DELTA YKL118W YAL002W -1.54 YKL119C
## 2 YKL119C YAL002W -6.82 YKL119C
## 3 YKL119C -- DELTA YKL118W YAL007C NA YKL119C
## 4 YKL119C YAL007C 0.428 YKL119C
## 5 YKL119C -- DELTA YKL118W YAL010C NA YKL119C
## 6 YKL119C YAL010C NA YKL119C
## 7 YKL119C -- DELTA YKL118W YAL011W 1.96 YKL119C
## 8 YKL119C YAL011W -4.44 YKL119C
## 9 YKL119C -- DELTA YKL118W YAL013W 1.02 YKL119C
## 10 YKL119C YAL013W NA YKL119C
## # ... with 2,702 more rows
#### matching to SGD gene names
ubergenes.ubermap.gene_names <- inner_join(ubermap.ubergenes.only, orf_index, by = c("ORF" = "orf")) %>%
rename("Gene.gene_name" = "gene_name")
ubergenes.ubermap.gene_names_both <- inner_join(ubergenes.ubermap.gene_names, orf_index, by = c("library.ORF" = "orf")) %>%
rename("library.gene_name" = "gene_name")
ubergenes.ubermap.gene_names_both %>%
filter(ORF == "YKL119C")
## # A tibble: 2,650 x 6
## Gene_uniq library.ORF score ORF Gene.gene_name library.gene_na…
## <chr> <chr> <dbl> <chr> <chr> <chr>
## 1 YKL119C -- D… YAL002W -1.54 YKL1… VPH2 VPS8
## 2 YKL119C YAL002W -6.82 YKL1… VPH2 VPS8
## 3 YKL119C -- D… YAL007C NA YKL1… VPH2 ERP2
## 4 YKL119C YAL007C 0.428 YKL1… VPH2 ERP2
## 5 YKL119C -- D… YAL010C NA YKL1… VPH2 MDM10
## 6 YKL119C YAL010C NA YKL1… VPH2 MDM10
## 7 YKL119C -- D… YAL011W 1.96 YKL1… VPH2 SWC3
## 8 YKL119C YAL011W -4.44 YKL1… VPH2 SWC3
## 9 YKL119C -- D… YAL013W 1.02 YKL1… VPH2 DEP1
## 10 YKL119C YAL013W NA YKL1… VPH2 DEP1
## # ... with 2,640 more rows
write_tsv(ubergenes.ubermap.gene_names_both, path = "basic_E-MAP_data/preprocessed_ubermap_ubergenes_only.txt")
########### 95% confidence for WT (GSP1-NAT)
(wt <- ubermap.per.mutant.95conf %>%
filter(Gene_uniq == "GSP1-NAT") %>%
select(-Gene_uniq, -category))
## # A tibble: 1 x 4
## lower_quant upper_quant upper_whisker lower_whisker
## <dbl> <dbl> <dbl> <dbl>
## 1 -1.20 1.05 1.09 -1.26
#### 95 % confidence for all the mutants data
(average.95conf <- ubermap.mutants.only %>%
summarize(lower_quant = quantile(score, probs = .025, na.rm = T),
upper_quant = quantile(score, probs = .975, na.rm = T),
upper_whisker = boxplot(score, plot = F)$stats[5, ],
lower_whisker = boxplot(score, plot = F)$stats[1, ]))
## # A tibble: 1 x 4
## lower_quant upper_quant upper_whisker lower_whisker
## <dbl> <dbl> <dbl> <dbl>
## 1 -2.89 2.23 1.59 -1.65
(s.lim.point<-c(average.95conf$lower_quant, average.95conf$upper_quant))
## 2.5% 97.5%
## -2.891910 2.225918
plot <- ggplot(ubermap.mutants.only, aes(x = score)) +
facet_wrap( ~ category) +
geom_density() +
geom_vline(data = ubermap.per.mutant.95conf, aes(xintercept = lower_quant), size = 1) +
geom_vline(data = ubermap.per.mutant.95conf, aes(xintercept = upper_quant), size = 1) +
geom_vline(data = wt, aes(xintercept = lower_quant), color = "red", size = 1) +
geom_vline(data = wt, aes(xintercept = upper_quant), color = "red", size = 1) +
geom_vline(data = average.95conf, aes(xintercept = lower_quant), color = "green", size = 1) +
geom_vline(data = average.95conf, aes(xintercept = upper_quant), color = "green", size = 1) +
geom_vline(data = ubermap.per.mutant.95conf, aes(xintercept = lower_whisker), color = "yellow", size = 1) +
geom_vline(data = ubermap.per.mutant.95conf, aes(xintercept = upper_whisker), color = "yellow", size = 1) +
geom_vline(data = wt, aes(xintercept = lower_whisker), color = "orange", size = 1) +
geom_vline(data = wt, aes(xintercept = upper_whisker), color = "orange", size = 1) +
geom_vline(data = average.95conf, aes(xintercept = lower_whisker), color = "blue", size = 1) +
geom_vline(data = average.95conf, aes(xintercept = upper_whisker), color = "blue", size = 1)
print(plot)
## Warning: Removed 51035 rows containing non-finite values (stat_density).

pdf(file = "Gsp1_mutants_score_distributions.pdf", width = 14)
print(plot)
## Warning: Removed 51035 rows containing non-finite values (stat_density).
dev.off()
## quartz_off_screen
## 2
####### What is the distribution of scores for the WT construct?
ubermap.mutants.only %>% filter(Gene_uniq == "GSP1-NAT") %>%
ggplot(aes(x = score)) + geom_histogram() + ggtitle("Distribution of scores for GSP1-NAT")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 7 rows containing non-finite values (stat_bin).

all.library.genes <- ubermap.mutants.only %>%
select(library.ORF) %>% unique() %>% pull()
(length(all.library.genes))
## [1] 1325
##### What is the distribution of scores for the library genes that have a score of lower than 10 in at least one mutant?
highest_mut_emap_score_lib_genes <- ubermap.mutants.only %>%
filter(score < -10) %>%
pull(library.gene_name) %>% unique()
length(highest_mut_emap_score_lib_genes)
## [1] 81
ubermap.mutants.only %>%
filter(library.gene_name %in% highest_mut_emap_score_lib_genes) %>%
ggplot(aes(x = score)) +
facet_wrap(~library.gene_name) +
geom_histogram() + ggtitle("Distribution of scores for most negative library genes")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 5 rows containing non-finite values (stat_bin).

#### get library genes with the most broad score distributions (for Gsp1 mutants genetic interactions)
mut_lib_genes_score_range <- ubermap.mutants.only %>%
group_by(library.ORF) %>%
do(setNames(data.frame(t(range(.$score, na.rm = T))), c("score_min", "score_max"))) %>%
mutate("score_range" = score_max - score_min) %>%
arrange(desc(score_range))
mut_lib_genes_score_range %>%
ggplot(., aes(x = score_range)) + geom_density()

### there are 304 library gene with a score range larger of 7.5
(mut_lib_genes_score_range_7.5 <- mut_lib_genes_score_range %>%
filter(score_range > 7.5) %>%
inner_join(., orf_index, by = c("library.ORF" = "orf")) %>%
rename("library.gene_name" = "gene_name"))
## # A tibble: 304 x 5
## # Groups: library.ORF [304]
## library.ORF score_min score_max score_range library.gene_name
## <chr> <dbl> <dbl> <dbl> <chr>
## 1 YLR018C -19.2 5.03 24.3 POM34
## 2 YDR395W -17.4 5.84 23.3 SXM1
## 3 YHR167W -17.1 4.88 22.0 THP2
## 4 YBL079W -17.2 4.18 21.4 NUP170
## 5 YDR363W-A -17.4 3.87 21.3 SEM1
## 6 YMR198W -17.5 3.71 21.3 CIK1
## 7 YMR153W -18.1 2.33 20.4 NUP53
## 8 YMR272C -15.7 4.69 20.4 SCS7
## 9 YML062C -16.9 3.46 20.3 MFT1
## 10 YPL018W -17.1 2.99 20.1 CTF19
## # ... with 294 more rows
write_tsv(mut_lib_genes_score_range_7.5, path = "library_genes_with_score_range_over_7.5_in_Gsp1_mut_screens.txt")
######### SIGNIFICANT LIBRARY GENES are defined as as genes that have an E-MAP score outside of the s.lim.point
## with at least one of the Gsp1 point mutants:
# #### make a subset of ubergenes.ubermap.gene_names_both that only has
## library genes that have scores outside the s.lim.point with at least one of the mutants
significant.library.genes <- ubermap.mutants.only %>%
filter(findInterval(score, s.lim.point) != 1) %>%
select(library.ORF) %>% unique() %>% pull()
(length(significant.library.genes))
## [1] 977
########### QUESTION: Should I remove from the list of significant.library.genes those that have significant scores with GSP1-NAT (wt control)
### One could assume that those genetic interaction are an artifact of the construct
(sig.interactions.in.wt <- ubermap.mutants.only %>%
filter(Gene_uniq == "GSP1-NAT" & findInterval(score, s.lim.point) != 1) %>%
inner_join(., orf_index, by = c("library.ORF" = "orf")) %>%
inner_join(., mut_lib_genes_score_range, by = "library.ORF"))
## # A tibble: 7 x 10
## Gene_uniq library.ORF score ORF Gene.gene_name library.gene_name
## <chr> <chr> <dbl> <chr> <chr> <chr>
## 1 GSP1-NAT YBL079W 2.51 YLR293C GSP1 NUP170
## 2 GSP1-NAT YDR159W 3.78 YLR293C GSP1 SAC3
## 3 GSP1-NAT YDR363W-A 3.37 YLR293C GSP1 SEM1
## 4 GSP1-NAT YHR167W 4.81 YLR293C GSP1 THP2
## 5 GSP1-NAT YKR082W 4.43 YLR293C GSP1 NUP133
## 6 GSP1-NAT YML062C 2.66 YLR293C GSP1 MFT1
## 7 GSP1-NAT YML103C 3.12 YLR293C GSP1 NUP188
## # ... with 4 more variables: gene_name <chr>, score_min <dbl>,
## # score_max <dbl>, score_range <dbl>
###### there is only 7 library genes that are outside of s.lim.point for the WT construct, and 3 of those are nuclear pore proteins
#### they all also have very large score distributions in mutants
###### I would assume it's not a good idea to discard those
##### how does the distribution of scores look like for those library genes ?
ubermap.mutants.only %>%
filter(library.gene_name %in% sig.interactions.in.wt$library.gene_name) %>%
ggplot(aes(x = score)) +
facet_wrap(~library.gene_name) + geom_histogram() + ggtitle("Distribution of E-MAP scores")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing non-finite values (stat_bin).

###### NOTE: SEM1, SAC3, MFT1 and THP2 are part of THO/TREX complexes
##### Is this relevant?
#### In any case, those 7 library genes all have very broad distribution of scores in mutants,
#### so don't discard any of them
(ubergenes.ubermap.gene_names_both_sig <- ubergenes.ubermap.gene_names_both %>%
filter(library.ORF %in% significant.library.genes))
## # A tibble: 4,364,259 x 6
## Gene_uniq library.ORF score ORF Gene.gene_name library.gene_na…
## <chr> <chr> <dbl> <chr> <chr> <chr>
## 1 YLR268W YAL002W 1.28 YLR26… SEC22 VPS8
## 2 YOR011W YAL002W -0.137 YOR01… AUS1 VPS8
## 3 YAL008W YAL002W NA YAL00… FUN14 VPS8
## 4 YOR043W YAL002W -0.915 YOR04… WHI2 VPS8
## 5 YBR255W YAL002W -1.89 YBR25… MTC4 VPS8
## 6 YBR128C YAL002W 0.174 YBR12… ATG14 VPS8
## 7 YDL214C YAL002W 0.0256 YDL21… PRR2 VPS8
## 8 YNL300W YAL002W 0.509 YNL30… TOS6 VPS8
## 9 YIL034C YAL002W 1.31 YIL03… CAP2 VPS8
## 10 YHR083W_TS… YAL002W -0.240 YHR08… SAM35 VPS8
## # ... with 4,364,249 more rows
write_tsv(ubergenes.ubermap.gene_names_both_sig, path = "basic_E-MAP_data/preprocessed_ubermap_ubergenes_only_significant.txt")
#### merge the ubergenes.ubermap.gene_names_both and ubermap.mutants.only into one tibble
(combined.emap.data <- bind_rows(ubermap.mutants.only, ubergenes.ubermap.gene_names_both))
## # A tibble: 5,996,950 x 6
## Gene_uniq library.ORF score ORF Gene.gene_name library.gene_na…
## <chr> <chr> <dbl> <chr> <chr> <chr>
## 1 A180T YAL002W 0.974 YLR2… GSP1 VPS8
## 2 CTER3XFLAG WT YAL002W -0.616 YLR2… GSP1 VPS8
## 3 D79A YAL002W 0.465 YLR2… GSP1 VPS8
## 4 D79S YAL002W 0.357 YLR2… GSP1 VPS8
## 5 E115A YAL002W -0.740 YLR2… GSP1 VPS8
## 6 E115I YAL002W -1.85 YLR2… GSP1 VPS8
## 7 F58A YAL002W 0.587 YLR2… GSP1 VPS8
## 8 F58L YAL002W -0.189 YLR2… GSP1 VPS8
## 9 G80A YAL002W -0.0453 YLR2… GSP1 VPS8
## 10 GSP1-NAT YAL002W 1.74 YLR2… GSP1 VPS8
## # ... with 5,996,940 more rows
tail(combined.emap.data)
## # A tibble: 6 x 6
## Gene_uniq library.ORF score ORF Gene.gene_name library.gene_name
## <chr> <chr> <dbl> <chr> <chr> <chr>
## 1 YDR247W YPR189W 0.582 YDR247W VHS1 SKI3
## 2 YHR086W YPR189W -0.660 YHR086W NAM8 SKI3
## 3 YBR015C YPR189W 0.634 YBR015C MNN2 SKI3
## 4 YML055W YPR189W 0.801 YML055W SPC2 SKI3
## 5 YGL190C YPR189W 2.96 YGL190C CDC55 SKI3
## 6 YLR176C YPR189W -0.154 YLR176C RFX1 SKI3
write_tsv(combined.emap.data, path = "basic_E-MAP_data/preprocessed_ubermap_all.txt")
###### only significant subset
(combined.emap.data_sig <- combined.emap.data %>%
filter(library.ORF %in% significant.library.genes))
## # A tibble: 4,421,902 x 6
## Gene_uniq library.ORF score ORF Gene.gene_name library.gene_na…
## <chr> <chr> <dbl> <chr> <chr> <chr>
## 1 A180T YAL002W 0.974 YLR2… GSP1 VPS8
## 2 CTER3XFLAG WT YAL002W -0.616 YLR2… GSP1 VPS8
## 3 D79A YAL002W 0.465 YLR2… GSP1 VPS8
## 4 D79S YAL002W 0.357 YLR2… GSP1 VPS8
## 5 E115A YAL002W -0.740 YLR2… GSP1 VPS8
## 6 E115I YAL002W -1.85 YLR2… GSP1 VPS8
## 7 F58A YAL002W 0.587 YLR2… GSP1 VPS8
## 8 F58L YAL002W -0.189 YLR2… GSP1 VPS8
## 9 G80A YAL002W -0.0453 YLR2… GSP1 VPS8
## 10 GSP1-NAT YAL002W 1.74 YLR2… GSP1 VPS8
## # ... with 4,421,892 more rows
write_tsv(combined.emap.data_sig, path = "basic_E-MAP_data/preprocessed_ubermap_all_sig.txt")
######### non merged ubermap
not.merged.ubermap <- read_tsv("basic_E-MAP_data/ORF_names_ubermap_not_merged.txt", col_names = T)
## Warning: Duplicated column names deduplicated: 'YDR113C' =>
## 'YDR113C_1' [911], 'YLR350W' => 'YLR350W_1' [991], 'YDR432W' =>
## 'YDR432W_1' [1216], 'YGR108W' => 'YGR108W_1' [1303], 'YDR113C' =>
## 'YDR113C_2' [1312], 'YLR350W' => 'YLR350W_2' [1347], 'YGR109C' =>
## 'YGR109C_1' [2039], 'YDR432W' => 'YDR432W_2' [2523], 'YDR113C' =>
## 'YDR113C_3' [2725], 'YER008C' => 'YER008C_1' [2736], 'YPR120C' =>
## 'YPR120C_1' [2823], 'YGR038W' => 'YGR038W_1' [3084], 'YBR160W' =>
## 'YBR160W_1' [3308], 'YBR088C' => 'YBR088C_1' [3371], 'YPR119W' =>
## 'YPR119W_1' [3503], 'YDL155W' => 'YDL155W_1' [3541], 'YDL084W' =>
## 'YDL084W_1' [3548], 'YDR432W' => 'YDR432W_3' [3567], 'YGR038W' =>
## 'YGR038W_2' [3737], 'YJL085W' => 'YJL085W_1' [4140], 'YBR088C' =>
## 'YBR088C_2' [4448], 'YPL153C' => 'YPL153C_1' [4596]
## Parsed with column specification:
## cols(
## .default = col_double(),
## Gene = col_character()
## )
## See spec(...) for full column specifications.
#### warning message when reading in the non-merged ubermap
# Warning message:
# Duplicated column names deduplicated: 'YDR113C' => 'YDR113C_1' [911], 'YLR350W' => 'YLR350W_1' [991], 'YDR432W' => 'YDR432W_1' [1216], 'YGR108W' => 'YGR108W_1' [1303], 'YDR113C' => 'YDR113C_2' [1312], 'YLR350W' => 'YLR350W_2' [1347], 'YGR109C' => 'YGR109C_1' [2039], 'YDR432W' => 'YDR432W_2' [2523], 'YDR113C' => 'YDR113C_3' [2725], 'YER008C' => 'YER008C_1' [2736], 'YPR120C' => 'YPR120C_1' [2823], 'YGR038W' => 'YGR038W_1' [3084], 'YBR160W' => 'YBR160W_1' [3308], 'YBR088C' => 'YBR088C_1' [3371], 'YPR119W' => 'YPR119W_1' [3503], 'YDL155W' => 'YDL155W_1' [3541], 'YDL084W' => 'YDL084W_1' [3548], 'YDR432W' => 'YDR432W_3' [3567], 'YGR038W' => 'YGR038W_2' [3737], 'YJL085W' => 'YJL085W_1' [4140], 'YBR088C' => 'YBR088C_2' [4448], 'YPL153C' => 'YPL153C_1' [4596]
### tidyverse sorts out duplicated library genes successfully
not.merged.ubermap %>%
ggplot(aes(x = YDR113C_1, y = YDR113C)) + geom_point()
## Warning: Removed 4702 rows containing missing values (geom_point).

#### can't gather before sorting out the Genes that occur multiple times (add a unique identifier to them)
### first find those Genes that are not unique (probably data from separate experiments/labs for the same thing? Stll better not to mix them up)
(not_uniq <- not.merged.ubermap %>% group_by(Gene) %>%
summarize("count" = n()) %>% filter(count > 1))
## # A tibble: 15 x 2
## Gene count
## <chr> <int>
## 1 YBR088C 3
## 2 YBR160W 2
## 3 YDL084W 2
## 4 YDL155W 2
## 5 YDR113C 4
## 6 YDR432W 4
## 7 YER008C 2
## 8 YGR038W 3
## 9 YGR108W 2
## 10 YGR109C 2
## 11 YJL085W 2
## 12 YLR350W 3
## 13 YPL153C 2
## 14 YPR119W 2
## 15 YPR120C 2
for (temp.gene in not_uniq$Gene) {
row.names <- which(not.merged.ubermap$Gene == temp.gene)
for (i in seq_along(row.names)) {
not.merged.ubermap$Gene[row.names[i]] <- paste(i, not.merged.ubermap$Gene[row.names[i]], sep = "_")
}
}
### now those few non-unique Genes look like this: 1_YDL155W - YDL155W
#### check
filter(not.merged.ubermap, Gene == "1_YJL085W" | Gene == "2_YJL085W")
## # A tibble: 2 x 5,398
## Gene YLR268W YOR011W YLR359W_DAmP YAL008W YOR043W YBR255W YBR128C
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1_YJL085W 2.00 0.0066 -1.13 -0.0527 NA NA 1.13
## 2 2_YJL085W NA -0.342 NA -1.91 -4.02 1.13 1.32
## # ... with 5,390 more variables: YDL214C <dbl>, YNL300W <dbl>,
## # YIL034C <dbl>, YHR083W_TSQ493 <dbl>, YGR164W <dbl>, YGR131W <dbl>,
## # YLR144C <dbl>, YNL003C <dbl>, YBR160W <dbl>, YJL082W <dbl>,
## # YIL044C <dbl>, YJL142C <dbl>, YPL191C <dbl>, YGL008C_DAmP <dbl>,
## # YGL215W <dbl>, YDR312W <dbl>, YLR348C <dbl>, YKL074C <dbl>,
## # YNR067C <dbl>, YJL077C <dbl>, `YKL096W-A` <dbl>, YFL010C <dbl>,
## # YDR141C <dbl>, YIL124W <dbl>, YPL039W <dbl>, YJL081C_DAmP <dbl>,
## # YKL048C <dbl>, YDR083W <dbl>, YIR013C <dbl>, YPL071C <dbl>,
## # YKL183W <dbl>, YPL024W <dbl>, YKL134C <dbl>, YGR063C <dbl>,
## # YKL029C <dbl>, YPL166W <dbl>, YLR220W <dbl>, YBR273C <dbl>,
## # YHR025W <dbl>, YOR313C <dbl>, YAL026C <dbl>, YHR007C_TSQ14 <dbl>,
## # YBL008W <dbl>, YGL237C <dbl>, YCL023C <dbl>, YAR035W <dbl>,
## # YLR145W_DAmP <dbl>, YOL129W <dbl>, YHR130C <dbl>, YOL116W <dbl>,
## # YBR182C <dbl>, YGL044C_DAmP <dbl>, YDR196C <dbl>, YBR280C <dbl>,
## # YNR032W <dbl>, YML105C <dbl>, YNL288W <dbl>, YPL104W <dbl>,
## # YIL059C <dbl>, YLR253W <dbl>, YDL140C_DAmP <dbl>, YNR018W <dbl>,
## # YPL144W <dbl>, YFR047C <dbl>, YBR215W <dbl>, YGL142C_DAmP <dbl>,
## # YHR078W <dbl>, YGR084C <dbl>, YJR115W <dbl>, YCR002C <dbl>,
## # YGL157W <dbl>, YBL015W <dbl>, YLR300W <dbl>, YBL003C <dbl>,
## # YJR140C <dbl>, YGR146C <dbl>, YOR033C <dbl>, YOR062C <dbl>,
## # YOR290C_DAMP <dbl>, YBR043C <dbl>, YHR112C <dbl>, YNR007C <dbl>,
## # YLR011W <dbl>, YFL008W_TSQ321 <dbl>, YKR103W <dbl>, YKR084C <dbl>,
## # YER121W <dbl>, YLR059C <dbl>, YDR059C <dbl>, YOR240W <dbl>,
## # YLR105C_DAmP <dbl>, YKL204W <dbl>, YGR291C <dbl>, YPL248C <dbl>,
## # YMR233W <dbl>, YIR022W_DAmP <dbl>, YDR393W <dbl>, YPR194C <dbl>,
## # YGR019W <dbl>, YGL244W <dbl>, …
# not_uniq is now empty
(not_uniq <- not.merged.ubermap %>% group_by(Gene) %>%
summarize("count" = n()) %>% filter(count > 1))
## # A tibble: 0 x 2
## # ... with 2 variables: Gene <chr>, count <int>
not.merged.ubermap <- not.merged.ubermap %>%
gather(library.ORF, score, -Gene) %>%
drop_na(score) %>%
rename("query_uniq" = "Gene") %>%
mutate("query.ORF" = gsub(query_uniq, pattern = " - DELTA", replacement = " -- DELTA", perl = T)) %>% ##### this is to make sure things like YLR337C - DELTA YLR338W - YLR337C - DELTA YLR338W are not collapse to just YLR337C in the next step
mutate("query.ORF" = gsub(query.ORF, pattern = "^[0-9]+_", replacement = "", perl = T)) %>%
mutate("query.ORF" = gsub(query.ORF, pattern = "_.+$", replacement = "", perl = T))
filter(not.merged.ubermap, query_uniq != query.ORF)
## # A tibble: 1,535,738 x 4
## query_uniq library.ORF score query.ORF
## <chr> <chr> <dbl> <chr>
## 1 YGL008C_DAmP YLR268W 1.22 YGL008C
## 2 YJL081C_DAmP YLR268W 0.645 YJL081C
## 3 YDL140C_DAmP YLR268W -2.48 YDL140C
## 4 YGL142C_DAmP YLR268W 0.309 YGL142C
## 5 YDL165W_DAmP YLR268W -0.781 YDL165W
## 6 YDR141C_DAmP YLR268W -8.36 YDR141C
## 7 YDR407C_DAmP YLR268W 0.584 YDR407C
## 8 YMR236W_DAmP YLR268W 3.47 YMR236W
## 9 YJL034W_DAmP YLR268W 0.118 YJL034W
## 10 YDR190C_DAmP YLR268W -6.28 YDR190C
## # ... with 1,535,728 more rows
not.merged.ubermap.gene.names <- inner_join(not.merged.ubermap, orf_index, by = c("query.ORF" = "orf")) %>%
rename("query_gene_name" = "gene_name")
#### confirm that all query_uniq are unique
##########################
# ubermap_per_lib_orf_count <- not.merged.ubermap.gene.names %>% group_by(library.ORF) %>%
# summarise("gene_count" = n())
# unique(ubermap_per_lib_orf_count$gene_count)
# ubermap_per_gene_count <- not.merged.ubermap.gene.names %>% group_by(query_uniq) %>%
# summarise("lib_count" = n())
# unique(ubermap_per_gene_count$lib_count)
# ubermap_per_gene_count %>%
# filter(lib_count > mean(ubermap_per_gene_count$lib_count)) %>%
# pull(query_uniq)
# ubermap_per_gene_count %>% filter(lib_count == 3072)
save(not.merged.ubermap.gene.names, file = "basic_E-MAP_data/ubermap_not_merged.RData")
##########################################
#### preprocess histone point mutations data
histones <- read_delim("basic_E-MAP_data/20180226_histones_pE-MAPs.txt", delim = "\t", col_names = T)
## Parsed with column specification:
## cols(
## .default = col_double(),
## Gene = col_character()
## )
## See spec(...) for full column specifications.
histones <- histones %>% gather(library.gene_name, score, -Gene)
write_tsv(histones, path = "basic_E-MAP_data/histones_pE-MAPs.txt")